Rob Baker et al fit growth parameters to B. rapa growth data and then used those to do function-value-trait QTL mapping.
I used those same parameters and queried a mutual rank (MR) gene expression network to find genes that were in a networks associated with these growth paramters. This was done separately for the CR and UN environments. I looked for overlap between those MR-associated genes and growth parameters.
The goal now is to find the trans eQTL for each of the MR-growth associated genes and ask if the trans eQTL overlap with the growth/function-value QTL. I think I am only going to take the top eQTL for each.
Focused on UN genes only.
This is for the CIM eQTL results
For each gene in a MR network with a growth parameter query the eQTL database to find its trans eQTL regions. Then compare overlaps between those and the growth QTL.
Libraries
library(qtl)
library(GenomicRanges)
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport,
clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply,
parSapply, parSapplyLB
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
colnames, colSums, dirname, do.call, duplicated, eval, evalq, Filter,
Find, get, grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rowMeans, rownames, rowSums, sapply,
setdiff, sort, table, tapply, union, unique, unsplit, which, which.max,
which.min
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:base’:
expand.grid
Loading required package: IRanges
Loading required package: GenomeInfoDb
library(tidyverse)
[30m── [1mAttaching packages[22m ───────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.0.0 [32m✔[30m [34mpurrr [30m 0.2.5
[32m✔[30m [34mtibble [30m 1.4.2 [32m✔[30m [34mdplyr [30m 0.7.6
[32m✔[30m [34mtidyr [30m 0.8.1 [32m✔[30m [34mstringr[30m 1.3.1
[32m✔[30m [34mreadr [30m 1.1.1 [32m✔[30m [34mforcats[30m 0.3.0[39m
[30m── [1mConflicts[22m ──────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mcollapse()[30m masks [34mIRanges[30m::collapse()
[31m✖[30m [34mdplyr[30m::[32mcombine()[30m masks [34mBiocGenerics[30m::combine()
[31m✖[30m [34mdplyr[30m::[32mdesc()[30m masks [34mIRanges[30m::desc()
[31m✖[30m [34mtidyr[30m::[32mexpand()[30m masks [34mS4Vectors[30m::expand()
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mfirst()[30m masks [34mS4Vectors[30m::first()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mggplot2[30m::[32mPosition()[30m masks [34mBiocGenerics[30m::Position(), [34mbase[30m::Position()
[31m✖[30m [34mpurrr[30m::[32mreduce()[30m masks [34mGenomicRanges[30m::reduce(), [34mIRanges[30m::reduce()
[31m✖[30m [34mdplyr[30m::[32mrename()[30m masks [34mS4Vectors[30m::rename()
[31m✖[30m [34mdplyr[30m::[32mslice()[30m masks [34mIRanges[30m::slice()[39m
library(magrittr)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
Growth QTL
filepath <- "../input/All2012HeightQTL2.xlsx"
filebase <- filepath %>% basename() %>% str_replace("\\..*$","")
QTLgenes <- readxl::read_excel(filepath)[,-1]
QTLgenes <- QTLgenes %>% dplyr::rename(.id=QTL, FVTtrait=FVT) # change names to match previous file
QTLgenes <- QTLgenes %>% filter(str_detect(FVTtrait,"^UN"))
QTLgenes
MR genes from UN
MR_UN_genes <- read_csv("../output/MR_UN_graphs_node_annotation_2012.csv") %>%
filter(MR_Cutoff <= 50, !duplicated(name)) %>%
mutate(pos=floor((start+end)/2)) %>%
select(MR_Cutoff,name, transcript_chrom=chrom, transcript_pos=pos)
Parsed with column specification:
cols(
.default = col_integer(),
.id = col_character(),
trt = col_character(),
name = col_character(),
chrom = col_character(),
subject = col_character(),
AGI = col_character(),
At_symbol = col_character(),
At_description = col_character(),
perc_ID = col_double(),
eval = col_double(),
score = col_double()
)
See spec(...) for full column specifications.
MR_UN_genes
eQTL
load("../output/scanone-MRgene-qtl_2012.RData")
scanone_CIM_UN <- scanone_MR_cim
scanone_CIM_UN <- scanone_CIM_UN[!str_detect(rownames(scanone_CIM_UN),"loc"),] #downstream code cannot deal with interpolated markers.
Get the scanone data in a nice format for summarizing and plotting
scanone_UN_gather <- scanone_CIM_UN %>%
gather(key = gene, value = LOD, -chr, -pos) %>%
right_join(MR_UN_genes,by=c("gene"="name")) # only keep genes in MR networks
plot eQTL peaks…
pl.UN <- scanone_UN_gather %>%
ggplot(aes(x=pos,y=LOD,color=gene)) +
geom_line() +
geom_hline(aes(yintercept=mean(lod.thrs.cim)),lty=2,lwd=.5,alpha=.5) +
facet_grid( ~ chr, scales="free") +
theme(strip.text.y = element_text(angle=0), axis.text.x = element_text(angle=90)) +
ggtitle("MR gene eQTL")
pl.UN
ggsave(str_c("../output/MR50 gene eQTL UN CIM", Sys.Date(), ".pdf"),width=12,height=8)
ggsave(str_c("../output/MR50 gene eQTL UN CIM", Sys.Date(), ".png"),width=10,height=5)
pl.UN + coord_cartesian(ylim=c(0,10))
plot eQTL peaks for MR30
pl.UN <- scanone_UN_gather %>%
filter(MR_Cutoff <=30) %>%
ggplot(aes(x=pos,y=LOD,color=gene)) +
geom_line() +
geom_hline(aes(yintercept=mean(lod.thrs.cim)),lty=2,lwd=.5,alpha=.5) +
facet_grid( ~ chr, scales="free") +
theme(strip.text.y = element_text(angle=0), axis.text.x = element_text(angle=90)) +
ggtitle("MR gene eQTL")
pl.UN
ggsave(str_c("../output/MR30 gene eQTL UN CIM", Sys.Date(), ".pdf"),width=12,height=8)
ggsave(str_c("../output/MR30 gene eQTL UN CIM", Sys.Date(), ".png"),width=10,height=5)
scanone_UN_gather %>%
arrange(transcript_chrom,transcript_pos,pos) %>%
filter(LOD>mean(lod.thrs.cim)) %>%
group_by(gene,chr) %>%
filter(LOD==max(LOD)) %>%
ungroup() %>%
mutate(transcript_index=row_number(),cis_trans=ifelse(chr==transcript_chrom,"cis","trans")) %>%
ggplot(aes(x=pos,y=transcript_index,shape=cis_trans,color=LOD)) +
scale_color_gradient(low="magenta1",high="magenta4") +
geom_point() +
facet_wrap(~chr,nrow=1) +
theme_bw() +
xlab("QTL position")
ggsave(str_c("../output/MR_gene_eQTL_cistrans_UN_CIM_", Sys.Date(), ".png"),width=10,height = 5)
sig_chromosomes_UN <- scanone_UN_gather %>%
group_by(gene,chr) %>%
summarize(pos=pos[which.max(LOD)],LOD=max(LOD)) %>%
filter(LOD > mean(lod.thrs.cim))
sig_chromosomes_UN
now for each significant chromosome/trait combo run bayesint
bayesint_list_UN <- apply(sig_chromosomes_UN,1,function(hit) {
result <- bayesint(scanone_CIM_UN[c("chr","pos",hit["gene"])],
chr=hit["chr"],
lodcolumn = 1,
prob=0.99,
expandtomarkers = TRUE
)
colnames(result)[3] <- "LOD"
result
})
names(bayesint_list_UN) <- sig_chromosomes_UN$gene
bayesint_list_UN <- lapply(bayesint_list_UN,function(x) x %>%
as.data.frame(stringsAsFactors=FALSE) %>%
rownames_to_column(var="markername") %>%
mutate(chr=as.character(chr))
)
bayesint_result_UN <- as.tibble(bind_rows(bayesint_list_UN,.id="gene")) %>%
select(gene,chr,pos,markername,LOD) %>%
separate(markername,into=c("chr1","Mbp"),sep="x", convert=TRUE) %>%
group_by(gene,chr) %>%
summarize(eQTL_start=min(Mbp),eQTL_end=max(Mbp),min_eQTL_LOD=min(LOD),max_eQTL_LOD=max(LOD)) %>%
#for the high QTL peaks the interval width is 0. That is overly precise and need to widen those.
mutate(eQTL_start=ifelse(eQTL_start==eQTL_end,max(0,eQTL_start-20000),eQTL_start),
eQTL_end=ifelse(eQTL_start==eQTL_end,eQTL_end+20000,eQTL_end))
bayesint_result_UN
Load annotation
BrapaAnnotation <- read_csv("../input/Brapa_V1.5_annotated.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = col_integer(),
name = col_character(),
chrom = col_character(),
start = col_integer(),
end = col_integer(),
subject = col_character(),
AGI = col_character(),
At_symbol = col_character(),
At_description = col_character(),
perc_ID = col_double(),
aln_length = col_integer(),
mismatch = col_integer(),
gap_open = col_integer(),
qstart = col_integer(),
qend = col_integer(),
sstart = col_integer(),
send = col_integer(),
eval = col_double(),
score = col_double()
)
|==== | 5%
|===== | 7%
|====== | 8%
|======= | 9%
|======== | 10%
|========= | 11%
|========== | 13%
|========== | 14% 1 MB
|=========== | 15% 1 MB
|=========== | 16% 1 MB
|============ | 17% 1 MB
|============= | 19% 1 MB
|============== | 20% 1 MB
|=============== | 21% 1 MB
|================ | 22% 1 MB
|================= | 24% 1 MB
|================== | 25% 1 MB
|================== | 26% 1 MB
|=================== | 27% 2 MB
|==================== | 29% 2 MB
|===================== | 30% 2 MB
|====================== | 31% 2 MB
|======================= | 32% 2 MB
|======================== | 34% 2 MB
|========================= | 35% 2 MB
|========================= | 36% 2 MB
|========================== | 37% 2 MB
|=========================== | 38% 2 MB
|============================ | 40% 2 MB
|============================= | 41% 3 MB
|============================== | 42% 3 MB
|=============================== | 43% 3 MB
|=============================== | 44% 3 MB
|================================ | 46% 3 MB
|================================= | 47% 3 MB
|================================== | 48% 3 MB
|=================================== | 49% 3 MB
|==================================== | 51% 3 MB
|===================================== | 52% 3 MB
|===================================== | 53% 3 MB
|====================================== | 54% 4 MB
|======================================= | 55% 4 MB
|======================================== | 57% 4 MB
|========================================= | 58% 4 MB
|========================================== | 59% 4 MB
|=========================================== | 60% 4 MB
|=========================================== | 61% 4 MB
|============================================ | 62% 4 MB
|============================================= | 64% 4 MB
|============================================== | 65% 4 MB
|=============================================== | 66% 4 MB
|================================================ | 67% 5 MB
|================================================= | 69% 5 MB
|================================================= | 70% 5 MB
|================================================== | 71% 5 MB
|=================================================== | 72% 5 MB
|==================================================== | 73% 5 MB
|===================================================== | 75% 5 MB
|====================================================== | 76% 5 MB
|======================================================= | 77% 5 MB
|======================================================== | 78% 5 MB
|======================================================== | 80% 5 MB
|========================================================= | 81% 6 MB
|========================================================== | 82% 6 MB
|=========================================================== | 83% 6 MB
|============================================================ | 84% 6 MB
|============================================================= | 86% 6 MB
|============================================================= | 87% 6 MB
|============================================================== | 88% 6 MB
|=============================================================== | 89% 6 MB
|================================================================ | 90% 6 MB
|================================================================= | 92% 6 MB
|================================================================== | 93% 6 MB
|=================================================================== | 94% 7 MB
|==================================================================== | 95% 7 MB
|==================================================================== | 97% 7 MB
|===================================================================== | 98% 7 MB
|======================================================================| 99% 7 MB
|=======================================================================| 100% 7 MB
BrapaAnnotation
UN_annotated <- lapply(1:nrow(bayesint_result_UN),function(row) {
qtl <- bayesint_result_UN[row,]
subset(BrapaAnnotation, chrom==qtl$chr &
start >= qtl$eQTL_start &
end <= qtl$eQTL_end)
}
)
names(UN_annotated) <- bayesint_result_UN$gene
UN_annotated <- bind_rows(UN_annotated,.id="MR_gene") %>%
left_join(bayesint_result_UN,by=c("MR_gene"="gene","chrom"="chr")) %>% #get eQTL LOD
left_join(MR_UN_genes,by=c("MR_gene"="name")) %>% # get cutoff
dplyr::rename(eQTL_candidate=name)
UN_annotated_small <- UN_annotated %>% select(MR_gene,MR_Cutoff,eQTL_chrom=chrom, eQTL_start, eQTL_end, eQTL_candidate,ends_with("LOD"))
UN_annotated_small
write_csv(UN_annotated_small, path=str_c("../output/",
filebase,
"_MR_eQTL_UN_ALL_CIM_",
Sys.Date(), ".csv"))
cis eQTL?
rr UN_annotated_small %>% filter(MR_gene==eQTL_candidate)
given bayesint results, find overlaps with UN growth QTL
rr UN_MReQTL_QTL_combined <- inner_join(QTLgenes,UN_annotated_small,by=c(=_candidate)) %>% select(.id, MR_gene, MR_Cutoff, eQTL_start, eQTL_end, ends_with(), everything()) %>% arrange(.id,desc(max_eQTL_LOD)) %>% dplyr::rename(eQTL_candidate=name) UN_MReQTL_QTL_combined r UN_MReQTL_QTL_combined_small <- UN_MReQTL_QTL_combined %>% filter(!duplicated(eQTL_candidate)) %>% select(-MR_gene,-MR_Cutoff) UN_MReQTL_QTL_combined_small
cis eQTL?
rr UN_MReQTL_QTL_combined %>% filter(MR_gene==eQTL_candidate) %>% arrange(MR_gene)
Total number of MR genes with an eQTL that overlaps with an FVT
rr UN_MReQTL_QTL_combined %>% select(MR_gene) %>% unique()
how many QTL have at least some overlap?
rr length(unique(QTLgenes$.id))
[1] 16
rr length(unique(UN_MReQTL_QTL_combined$.id))
[1] 10
10 of 16
rr write_csv(UN_MReQTL_QTL_combined, path=str_c(../output/, filebase, _MR_eQTL_UN_overlap_CIM_, Sys.Date(), .csv))
How to assess if overlap is significant?
I think pull regions of same size as eQTL and ask how often they overlap with growth QTL.
For each eQTL, randomly select a chromosome, then a position, and widen based on interval. Then check overlap. Repeat.
Make table of chromosome info
rr chr.info <- scanone_CIM_UN %>% as.data.frame() %>% rownames_to_column() %>% select(marker) %>% separate(marker,into=c(,),sep=,convert=TRUE) %>% group_by(chr) %>% summarize(start=min(bp),end=max(bp))
Make a table of QTL info
rr qtl.info <- QTLgenes %>% group_by(.id) %>% summarize(chrom=unique(chrom),start=min(start),end=max(end)) qtl.info r qtl.ranges <- GRanges(seqnames = qtl.info\(chrom,ranges=IRanges(start=qtl.info\)start,end=qtl.info$end)) qtl.ranges <- GenomicRanges::reduce(qtl.ranges)
The eQTL are in Bayesint_results
rr sims <- 1000 eQTL.ranges <- GRanges(bayesint_result_UN\(chr, ranges = IRanges(start=bayesint_result_UN\)eQTL_start, end=bayesint_result_UN\(eQTL_end)) eQTL.ranges <- GenomicRanges::reduce(eQTL.ranges) set.seed(54321) sim.results <- sapply(1:sims, function(s) { sim.eQTL <- tibble( chr=sample(chr.info\)chr, size = length(eQTL.ranges), replace = TRUE, prob=chr.info\(end/sum(chr.info\)end)), width=width(eQTL.ranges) # width of the QTL to simulate ) sim.eQTL <- chr.info %>% select(chr,chr.start=start,chr.end=end) %>% right_join(sim.eQTL,by=) #need to get the chrom end so we can sample correctly sim.eQTL <- sim.eQTL %>% mutate(qtl.start = runif(n=n(), min = chr.start, max= max(chr.start,chr.end-width)), qtl.end=qtl.start+width) sim.eQTL.ranges <- GRanges(seqnames = sim.eQTL\(chr,ranges = IRanges(start=sim.eQTL\)qtl.start,end=sim.eQTL$qtl.end)) suppressWarnings(result <- sum(countOverlaps(qtl.ranges,sim.eQTL.ranges)>0)) result })
rr true.overlap <- sum(countOverlaps(qtl.ranges,eQTL.ranges)) #OK to ignore warnings
Each of the 2 combined objects has sequence levels not in the other:
- in 'x': A05
- in 'y': A04, A01
Make sure to always combine/compare objects based on the same reference
genome (use suppressWarnings() to suppress this warning).
rr true.overlap
[1] 6
rr mean(sim.results >= true.overlap)
[1] 0.003
rr tibble(FVTQTL_vs_MReQTL_True_Overlaps=true.overlap, N_Simulations_fewer_overlaps=sum(sim.results < true.overlap), N_Simulations_greater_equal_overlaps=sum(sim.results >= true.overlap), P_value=mean(sim.results >= true.overlap) ) %>% write_csv(str_c(../output/, filebase, _MReQTL_overlap_pval_CIM, Sys.Date(), .csv))
significant at p = 0.003; only 0.3% of the simulations had as many overlaps as in the true data set.